%autosave 0
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import re
import os
import sys
import time
import quandl
# import hdbscan
import logging
import datetime
import platform
import collections
import numpy as np
import pandas as pd
import pickle as pkl
from glob import glob
from scipy import stats
from scipy import linalg
from scipy.linalg import (
toeplitz, cholesky
)
from textwrap import wrap
from datetime import datetime
from pandas import DataFrame
from six.moves.urllib.request import urlopen
from six.moves.urllib.parse import urlencode
from statsmodels.tsa.stattools import coint
from matplotlib.collections import LineCollection
from sklearn import (
cluster,
covariance,
manifold,
preprocessing
)
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.covariance import (
OAS,
GraphLasso,
LedoitWolf,
log_likelihood,
ShrunkCovariance,
empirical_covariance
)
from sklearn.model_selection import GridSearchCV
# Quandl API configuration.
# SECURITY NOTE(review): hard-coded API key checked into source; move it
# to an environment variable or a local config file.
quandl.ApiConfig.api_key = 'vP-UcGRwvsHvnr9cxtTc'
operating_system = platform.system()
operating_system
# Pick the path separator for the host OS.
if operating_system == 'Windows':
    folder_delimiter = r'\\'
elif operating_system == 'Linux':
    folder_delimiter = r'/'
else:
    # BUG FIX: folder_delimiter was previously left undefined here,
    # crashing every path construction below (e.g. on macOS/'Darwin').
    # Fall back to the platform's native separator.
    print("Unknown operating system.")
    folder_delimiter = os.sep
# Data folder locations used throughout the notebook.
sectorsfolder = r'DATA-Sectors' + folder_delimiter
dictfolder_market = r'DATA-Dictionaries' + folder_delimiter
dictfolder_sectors = r'DATA-Dictionaries-Sectors' + folder_delimiter
returnsfolder_market = r'DATA-Returns-Market' + folder_delimiter
returnsfolder_sectors = r'DATA-Returns-Sectors' + folder_delimiter
fundamentalsfolder = r'DATA-Fundamentals-Market' + folder_delimiter
closefolder_market = r'DATA-Close-Market' + folder_delimiter
closefolder_sectors = r'DATA-Close-Sectors' + folder_delimiter
openclosefolder_sectors = r'DATA-OpenClose-Sectors' + folder_delimiter
closefolder_sectors_cleaned = r'DATA-Close-Sectors-Cleaned' + folder_delimiter
# Fundamentals spreadsheets.
# BUG FIX: the price-to-book / price-to-earnings / enterprise-value
# paths all pointed at "marketcap.xlsx" (copy-paste error); they now
# match the filenames used later in the notebook.
marketcap_filepath = fundamentalsfolder + "marketcap.xlsx"
pricetobook_filepath = fundamentalsfolder + "price_to_book.xlsx"
pricetoearnings_filepath = fundamentalsfolder + "price_to_earnings.xlsx"
enterprisevalue_filepath = fundamentalsfolder + "enterprise_value.xlsx"
print ("Numpy: %s " % np.__version__)
print ("Pandas: %s " % pd.__version__)
# Input:
# X should be standard returns data
# with features as column headers
# and rows as observation data.
def build_affinity_clusters(stock_universe, X, pricing=None):
    """Cluster stocks via a sparse inverse covariance (graphical lasso)
    followed by affinity propagation, and pickle the results.

    Parameters
    ----------
    stock_universe : str
        Name used for the output pickle file.
    X : pd.DataFrame
        Returns data with tickers as column headers and observations as
        rows.  NOTE: standardized in place (mutates the caller's frame).
    pricing : pd.DataFrame, optional
        Closing prices; when given, the price series of the 3 smallest
        clusters are plotted.

    Returns
    -------
    (cov, icov, labels, counts, cluster_map), or None when the
    graphical-lasso fit fails numerically.
    """
    np.warnings.filterwarnings('ignore')
    # Standardize each column: std() returns a Series that broadcasts
    # row-wise across the whole DataFrame.
    X /= X.std(axis=0)
    emp_cov, shrunk_cov, lw, oa = get_covariance(X)
    try:
        edge_model = covariance.GraphLassoCV()
        edge_model.fit(X)
        cov = edge_model.covariance_
        icov = edge_model.precision_
    except FloatingPointError:
        # BUG FIX: the old message referenced an undefined name `alpha`
        # (left over from commented-out code), raising NameError inside
        # the handler.
        print("GraphLassoCV failed with a floating point error.")
        return
    # dimensions of precision matrix = NxN, where N = number of stocks
    _, labels = cluster.affinity_propagation(cov)
    # map of ticker symbols to cluster numbers
    cluster_map = pd.Series(
        index=X.columns,
        data=labels)
    # number of stocks in each cluster (value_counts yields a Series)
    counts = cluster_map.value_counts()
    nclusters = len(counts)
    print("\n")
    print("Number of clusters detected: %d" % nclusters)
    max_size = counts.max()  # renamed: don't shadow builtin max()
    print("Cluster max size: %d" % max_size)
    # Unlike DBSCAN, affinity propagation does not attempt to filter out
    # noise (in particular, it places no lower limit on cluster size).
    bigger_than_one = counts[counts > 1]
    # select 3 largest clusters
    highest3clusters = bigger_than_one.nlargest(3).index
    highest3counts = list(bigger_than_one[highest3clusters])
    print("\n3 cluster labels with the highest counts=%s" % list(highest3clusters))
    print("3 highest counts=%s" % highest3counts)
    plt.barh(
        highest3clusters,
        highest3counts
    )
    plt.title('Top 3 Cluster Member Counts')
    plt.xlabel('Stocks in Cluster')
    plt.ylabel('Cluster Number');
    # If pricing data is provided, plot the time series of the stocks
    # in the 3 smallest clusters.
    if pricing is not None:
        smallest3clusters = bigger_than_one.nsmallest(3).index
        smallest3counts = list(bigger_than_one[smallest3clusters])
        print("\nPlot the price series of stocks in smallest clusters.\n")
        print("3 cluster labels with the smallest counts=%s" % list(smallest3clusters))
        print("3 smallest counts=%s" % smallest3counts)
        for n, clust in enumerate(smallest3clusters):
            # tickers belonging to this cluster
            members = list(cluster_map[cluster_map == clust].index)
            print("Tickers in cluster %d: %s" % (clust, members))
            # graph the time series only for clusters of size > 1
            if smallest3counts[n] > 1:
                # de-mean each log-price series before plotting
                means = np.log(pricing[members].mean())
                data = np.log(pricing[members]).sub(means)
                data.plot(title='Stock Time Series for Cluster %d' % clust)
    output_tuple = (cov, icov, labels, counts, cluster_map)
    # NOTE(review): Windows-style relative path; assumes an "Output"
    # folder exists next to the notebook.
    opfilename = "Output\\" + stock_universe + ".pkl"
    with open(opfilename, "wb") as fd:
        pkl.dump(output_tuple, fd)
    return output_tuple
A huge advantage of DBSCAN clustering is that it identifies noise better than many other clustering algorithms and can choose not to include some data points in any cluster. Tickers rejected by DBSCAN are assigned the pseudo-cluster number -1.
Discard all tickers with a cluster value of -1.
clf.labels_ contains a simple 1D array (or vector) of cluster numbers. The index number of each element (or if you want to view it as a column vector, the row number of each element) corresponds to the index number of the ticker in X.
def build_dbscan_clusters(X, tickers, pricing=None):
    """Cluster feature vectors with DBSCAN and summarize the clusters.

    Parameters
    ----------
    X : array-like of shape (n_stocks, n_features)
        One feature vector per stock (e.g. PCA components + fundamentals).
    tickers : sequence
        Ticker symbols, positionally aligned with the rows of X.
    pricing : pd.DataFrame, optional
        Closing prices; when given, the price series of the 3 smallest
        clusters are plotted.

    Returns
    -------
    (dbscan_cluster_map_all, dbscan_cluster_map, dbscan_counts,
     dbscan_labels) -- the first map still contains the noise label -1;
    the second has noise removed.
    """
    clf = DBSCAN(eps=1.5, leaf_size=30, min_samples=3)
    print (clf)
    clf.fit(X)
    # labels_ is a 1-D array of cluster numbers whose element positions
    # correspond to the row positions of the tickers in X
    dbscan_labels = clf.labels_
    # map of ticker symbols to cluster numbers
    dbscan_cluster_map_all = pd.Series(
        index=tickers,
        data=dbscan_labels)
    # DBSCAN marks noise with pseudo-cluster -1; drop those tickers
    dbscan_cluster_map = dbscan_cluster_map_all[
        dbscan_cluster_map_all != -1]
    # number of stocks in each cluster (value_counts yields a Series)
    dbscan_counts = dbscan_cluster_map.value_counts()
    dbscan_nclusters = len(dbscan_counts)
    print("Number of DBSCAN clusters detected: %d" % dbscan_nclusters)
    # number of ordered pairs we can create from the clusters
    npairs = (dbscan_counts * (dbscan_counts - 1)).sum()
    print("Number of pairs we can create from DBSCAN clusters: %d" % npairs)
    # We cannot map a count back to its cluster deterministically
    # (several clusters could share a size), but we only need *a*
    # largest cluster, so that doesn't matter here.
    max_size = dbscan_counts.max()  # renamed: don't shadow builtin max()
    num_largestcluster = dict((v, k) for k, v in dbscan_counts.items()).get(max_size)
    print ("Maximum cluster size (in cluster %d): %d" % (
        num_largestcluster, max_size))
    # Clusters with 1 member are not interesting.
    # BUG FIX: this previously filtered on the undefined/stale name
    # `counts` instead of dbscan_counts.
    bigger_than_one = dbscan_counts[dbscan_counts > 1]
    # select 3 largest clusters
    highest3clusters = bigger_than_one.nlargest(3).index
    highest3counts = list(bigger_than_one[highest3clusters])
    print("\n3 cluster labels with the highest counts=%s" % list(highest3clusters))
    print("3 highest counts=%s" % highest3counts)
    plt.barh(
        highest3clusters,
        highest3counts
    )
    plt.title('Top 3 Cluster Member Counts')
    plt.xlabel('Stocks in Cluster')
    plt.ylabel('Cluster Number');
    # If pricing data is provided, plot the time series of the stocks
    # in the 3 smallest clusters.
    if pricing is not None:
        smallest3clusters = bigger_than_one.nsmallest(3).index
        smallest3counts = list(bigger_than_one[smallest3clusters])
        print("\nPlot the price series of stocks in smallest clusters.\n")
        print("3 cluster labels with the smallest counts=%s" % list(smallest3clusters))
        print("3 smallest counts=%s" % smallest3counts)
        for n, clust in enumerate(smallest3clusters):
            # tickers in this cluster (renamed so we no longer clobber
            # the `tickers` parameter, as the original code did)
            members = list(dbscan_cluster_map[dbscan_cluster_map == clust].index)
            print("Tickers in cluster %d: %s" % (clust, members))
            # graph the time series only for clusters of size > 1
            if smallest3counts[n] > 1:
                # de-mean each log-price series before plotting
                means = np.log(pricing[members].mean())
                data = np.log(pricing[members]).sub(means)
                data.plot(title='Stock Time Series for Cluster %d' % clust)
    return (dbscan_cluster_map_all,
            dbscan_cluster_map,
            dbscan_counts,
            dbscan_labels
            )
def list_clusters(tickers, clusterlabels):
    """Group ticker symbols by their cluster label.

    Parameters
    ----------
    tickers : pd.DataFrame
        Frame whose columns are ticker symbols (only .columns is used).
    clusterlabels : sequence of int
        Cluster label for each column of `tickers`.

    Returns
    -------
    dict mapping cluster number -> list of ticker symbols.
    """
    # map of ticker symbols to cluster numbers
    cluster_map = pd.Series(
        index=tickers.columns,
        data=clusterlabels)
    dic = {}
    # BUG FIX: the original iterated over the undefined name
    # `clustermap` (the variable is called cluster_map), so this
    # function always raised NameError.
    for ticker in cluster_map.index:
        clust = cluster_map[ticker]
        if clust in dic:
            dic[clust].append(ticker)
        else:
            dic[clust] = [str(ticker)]
    return dic
find_cointegrated_pairs: find cointegrated pairs in a cluster.
# Acknowledgement:
# This function was created by Delanie MacKensie and
# posted on the Quantopian website here:
# https://www.quantopian.com/lectures/introduction-to-pairs-trading
def find_cointegrated_pairs(data, significance=0.05):
    """Run pairwise Engle-Granger cointegration tests on the columns of
    `data` and return (score_matrix, pvalue_matrix, pairs), where
    `pairs` lists the column pairs whose p-value beats `significance`.

    Acknowledgement: adapted from Delanie MacKensie's Quantopian lecture:
    https://www.quantopian.com/lectures/introduction-to-pairs-trading
    """
    n = data.shape[1]
    scores = np.zeros((n, n))
    pvalues = np.ones((n, n))
    columns = data.keys()
    significant_pairs = []
    # test each unordered pair of columns exactly once
    for a in range(n):
        for b in range(a + 1, n):
            series_a = data[columns[a]]
            series_b = data[columns[b]]
            t_stat, p_val, _ = coint(series_a, series_b)
            scores[a, b] = t_stat
            pvalues[a, b] = p_val
            if p_val < significance:
                significant_pairs.append((columns[a], columns[b]))
    return scores, pvalues, significant_pairs
# generate dictionary of cointegrated pairs
def gen_dictionary_coint_pairs(pricing, cluster_map, counts):
    """Test every cluster for cointegrated pairs.

    Parameters
    ----------
    pricing : pd.DataFrame
        Closing prices, tickers as columns.
    cluster_map : pd.Series
        Maps ticker -> cluster number.
    counts : pd.Series
        Maps cluster number -> member count.

    Returns
    -------
    (cluster_dict, aggregated_fraction) where cluster_dict maps each
    cluster number to a dict with keys 'score_matrix', 'pvalue_matrix',
    'pairs', 'cluster_size', 'number_potential_pairs',
    'number_of_pairs', 'fraction_successful', and aggregated_fraction
    is the mean fraction of candidate pairs that cointegrate.
    """
    cluster_dict = {}
    aggregated_fraction = 0
    n = 0
    for clust in counts.index:
        # fetch the tickers for this cluster
        tickers = cluster_map[cluster_map == clust].index
        num_tickers = len(tickers)
        # single-member clusters cannot form pairs
        if num_tickers < 2:
            continue
        score_matrix, pvalue_matrix, pairs = find_cointegrated_pairs(
            pricing[tickers]
        )
        num_pairs = len(pairs)
        # N*(N-1)/2 unordered candidate pairs
        num_potential_pairs = num_tickers * (num_tickers - 1) / 2
        # fraction of candidate pairs in this cluster that cointegrate
        f = num_pairs / num_potential_pairs
        cluster_dict[clust] = {
            'score_matrix': score_matrix,
            'pvalue_matrix': pvalue_matrix,
            'pairs': pairs,
            'cluster_size': num_tickers,
            'number_potential_pairs': num_potential_pairs,
            'number_of_pairs': num_pairs,
            'fraction_successful': f,
        }
        aggregated_fraction = aggregated_fraction + f
        n = n + 1
    # BUG FIX: guard against ZeroDivisionError when no cluster has two
    # or more members (n stays 0 and aggregated_fraction stays 0).
    if n > 0:
        aggregated_fraction /= n
    return cluster_dict, aggregated_fraction
def aggregate_pairs_allclusters(stock_universe, pricing, cluster_map, counts):
    """Collect the cointegrated pairs from every cluster into a single
    list, report summary statistics, and pickle both the per-cluster
    dictionary and the aggregated pair list under Output\\.

    Returns (aggregated_pairs, cluster_dict).
    """
    cluster_dict, fraction_successful = gen_dictionary_coint_pairs(
        pricing,
        cluster_map,
        counts)
    # flatten the per-cluster pair lists into one list
    aggregated_pairs = []
    for cluster_data in cluster_dict.values():
        aggregated_pairs.extend(cluster_data['pairs'])
    print("We found %d pairs." % len(aggregated_pairs))
    print("Containing %d unique tickers." % len(
        np.unique(aggregated_pairs)))
    print("Fraction (on the average) of pairs that cointegrate = %f" % (
        fraction_successful))
    # persist both structures for later sessions
    base = "Output\\" + stock_universe
    with open(base + ".clust_dict.pkl", "wb") as fd:
        pkl.dump(cluster_dict, fd)
    with open(base + ".pairs.pkl", "wb") as fd:
        pkl.dump(aggregated_pairs, fd)
    return (aggregated_pairs, cluster_dict)
partially_correlated_pairs
Return DataFrame with partially correlated pairs, their ticker symbols and their partial correlation value.
def partially_correlated_pairs(stock_universe, returns, icov, minvalue):
    """Return a DataFrame of stock pairs whose absolute partial
    correlation (off-diagonal precision-matrix entry) exceeds minvalue.

    Parameters
    ----------
    stock_universe : str
        Name used for the output pickle file.
    returns : pd.DataFrame
        Returns data; only its columns (ticker symbols) are used.
    icov : np.ndarray
        Precision (inverse covariance) matrix aligned with returns.columns.
    minvalue : float
        Threshold on the absolute partial-correlation value.

    Returns
    -------
    DataFrame with columns 'indices' (matrix positions), 'values'
    (absolute partial correlations) and 'pairs' (ticker tuples),
    sorted by 'values' in descending order.  Also pickled to Output\\.
    """
    num_stocks = len(returns.columns)
    # map matrix row/column index -> ticker symbol
    idx_to_stock = pd.Series(
        index=np.arange(num_stocks),
        data=returns.columns)
    # upper triangle only (k=1) so each pair is considered once
    filtered = (np.abs(np.triu(icov, k=1)) > minvalue)
    values = np.abs(icov[filtered])
    x, y = np.where(filtered)
    indices = list(zip(x, y))
    df = pd.DataFrame()
    df['indices'] = indices
    df['values'] = values
    # translate index pairs into ticker-symbol pairs
    # (CLEANUP: replaces the old zero-filled helper Series and the dead
    # bare `df` expression left over from the notebook cell)
    to_pair = lambda tup: tuple(idx_to_stock[i] for i in tup)
    df['pairs'] = [to_pair(t) for t in df['indices']]
    # strongest partial correlations first
    df = df.sort_values('values', ascending=False)
    corrval = "%.3f" % minvalue
    opfn = "Output\\" + stock_universe
    opfn = opfn + ".pairs.incov." + corrval
    opfilename = opfn + ".pkl"
    with open(opfilename, "wb") as fd:
        pkl.dump(df, fd)
    return df
find_coint_partially_corr_pairs:
Check partially correlated pairs identified by the precision matrix to see if any of them are cointegrated.
def find_coint_partially_corr_pairs(df):
    """Check partially correlated pairs for cointegration.

    NOTE(review): relies on a module-level `pricing` DataFrame holding
    the price series for every ticker in df['pairs'] -- confirm it is
    loaded before calling.

    Parameters
    ----------
    df : pd.DataFrame
        Output of partially_correlated_pairs (needs a 'pairs' column).

    Returns
    -------
    List of cointegrated (ticker1, ticker2) tuples, or None when df is
    empty.
    """
    significance = 0.05
    pairs = []
    for x in df['pairs']:
        ticker1, ticker2 = x
        S1 = pricing[ticker1]
        S2 = pricing[ticker2]
        # coint returns (t-statistic, p-value, critical values)
        _, pvalue, _ = coint(S1, S2)
        if pvalue < significance:
            pairs.append((ticker1, ticker2))
    num_potential_pairs = len(df)
    # CLEANUP: the two header prints were previously duplicated in the
    # empty and non-empty branches.
    print("Finding cointegrated pairs of partially correlated stocks:")
    print("Number of potentially cointegrated pairs = %d" % (
        num_potential_pairs))
    if num_potential_pairs < 1:
        # nothing to evaluate; mirror the original early-return behavior
        return
    num_pairs = len(pairs)
    # BUG FIX: message typo "conintegrated" corrected
    print("Number of cointegrated pairs = %d" % num_pairs)
    str_fraction = "%.3f" % (num_pairs / num_potential_pairs)
    print("Fraction of partially corr pairs that cointegrate = %s" % (
        str_fraction))
    return pairs
Test soundness of covariance matrix
def negative_eigenvalues(cov):
    """Return True when the covariance matrix has any negative eigenvalue
    (i.e. it is not positive semi-definite).

    BUG FIX: linalg.eig returns complex eigenvalues; ordering complex
    values against 0 is fragile.  A valid covariance matrix is symmetric,
    so its eigenvalues are real -- compare the real parts.
    """
    w, v = linalg.eig(cov)
    return np.any(w.real < 0)
def maxmin_eigenvalues(cov):
    """Return (largest, smallest) eigenvalue of the covariance matrix.

    BUG FIX: linalg.eig returns complex eigenvalues; max/min on complex
    values is fragile.  A valid covariance matrix is symmetric with real
    eigenvalues, so compare the real parts.
    """
    w, v = linalg.eig(cov)
    return np.max(w.real), np.min(w.real)
def get_covariance(X):
    """Compute several covariance estimates for returns data.

    Parameters
    ----------
    X : DataFrame/ndarray of shape (n_samples, n_features)

    Returns
    -------
    (emp_cov, shrunk_cov, lw, oa):
        empirical covariance, a 0.9-shrunk covariance, and the fitted
        LedoitWolf / OAS estimator objects (fitted on the first ~90%
        of the rows of X.T).
    """
    # Empirical covariance of the raw observations
    emp_cov = covariance.empirical_covariance(X)
    shrunk_cov = covariance.shrunk_covariance(
        emp_cov, shrinkage=0.9)
    X_T = X.T.copy()
    # Fit the shrinkage estimators on the first ~90% of the transposed
    # rows.  CLEANUP: the original's train/test split assertion could
    # never fail (the two slices always sum to len(X_T)), the test slice
    # was never used, and the try/except hid an implicit None return --
    # all removed.
    train_len = int(0.9 * len(X_T)) + 1
    X_train = X_T[:train_len]
    # Ledoit-Wolf optimal shrinkage coefficient estimate
    lw = LedoitWolf()
    lw.fit(X_train)
    # OAS coefficient estimate
    oa = OAS()
    oa.fit(X_train)
    return emp_cov, shrunk_cov, lw, oa
# Input:
# X should be standard returns data
# with features as column headers
# and rows as observation data.
def get_LLE_embeddings(X):
    """Embed the tickers of X into 2-D via Locally Linear Embedding.

    X holds returns data with tickers as columns and observations as
    rows.  NOTE: X is standardized in place (mutates the caller's frame).

    Returns (embed0, embed1, embedding): the two coordinate arrays plus
    the raw (n_tickers, 2) embedding.
    """
    # Standardize each column; std() returns a Series that broadcasts
    # row-wise over the frame.
    X /= X.std(axis=0)
    model = manifold.LocallyLinearEmbedding(
        n_components=2,
        eigen_solver='dense',
        n_neighbors=6)
    # Tickers label the columns, so transpose before fitting: the model
    # expects one row per sample (ticker).
    embedding = model.fit_transform(X.T)
    print("embedding.shape=%s" % str(embedding.shape))
    # plt.scatter() wants the two coordinate lists separately; they are
    # the columns of `embedding`, i.e. rows 0 and 1 of its transpose.
    embed0, embed1 = embedding.T[0], embedding.T[1]
    print("embed0.shape=%s" % str(embed0.shape))
    print("embed1.shape=%s" % str(embed1.shape))
    return embed0, embed1, embedding
def get_TSNE_embeddings(X):
    """Embed the rows of X into 2-D with T-SNE (fixed random_state for
    reproducibility).

    Returns (embed0, embed1, embedding): the two coordinate arrays plus
    the raw (n_samples, 2) embedding.
    """
    # Rows of X are already one-per-ticker here, so no transpose needed.
    tsne = TSNE(learning_rate=1000, perplexity=25, random_state=1337)
    embedding = tsne.fit_transform(X)
    print("embedding.shape=%s" % str(embedding.shape))
    # plt.scatter() wants the two coordinate lists separately; they are
    # the columns of `embedding`, i.e. rows 0 and 1 of its transpose.
    embed0, embed1 = embedding.T[0], embedding.T[1]
    print("embed0.shape=%s" % str(embed0.shape))
    print("embed1.shape=%s" % str(embed1.shape))
    return embed0, embed1, embedding
def do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=1000000):
    """Scatter the 2-D embedding on `fig`: one point per cluster,
    colored by label, with dot area scaled by cluster member count."""
    ax = fig.add_subplot(1, 1, 1)
    plt.axis('off')
    # dot area grows with the number of members in the cluster
    ax.scatter(embed0,
               embed1,
               s=dotsize * counts.values,
               c=labels,
               cmap=plt.cm.nipy_spectral)
def do_noisy_scatter_plot(x, y, x_noise, y_noise, labels, fig):
    """Scatter clustered points (colored, opaque) together with noise
    points (uncolored, translucent) on the same axes."""
    ax = fig.add_subplot(1, 1, 1)
    # clustered points: colored by label, skipping the -1 noise label
    ax.scatter(x, y,
               s=100,
               alpha=0.85,
               c=labels[labels != -1],
               cmap=cm.Paired)
    # noise points: faint, default color
    ax.scatter(x_noise, y_noise,
               s=100,
               alpha=0.05)
    plt.title("T-SNE of all Stocks with DBSCAN Clusters Noted");
def do_LLE_graph(X, icov, labels, embedding, fig):
    """Plot the 2-D embedding as a graph: one node per stock, colored
    by cluster label, with edges between stocks whose partial
    correlation exceeds 0.02; edge width and color scale with the
    strength of the partial correlation.

    NOTE(review): `icov` is rescaled in place, so the caller's precision
    matrix is mutated.
    """
    # Display a graph of the partial correlations
    # partial_correlations = edge_model.precision_.copy()
    # d = 1 / np.sqrt(np.diag(partial_correlations))
    n_labels = len(labels)
    ax = fig.add_subplot(1,1,1)
    # Normalize the precision matrix by the inverse square roots of its
    # diagonal, turning entries into partial correlations.
    d = 1 / np.sqrt(np.diag(icov))
    icov *= d
    icov *= d[:, np.newaxis]
    # Keep only upper-triangle entries above the display threshold.
    non_zero = (np.abs(np.triu(icov, k=1)) > 0.02)
    if not non_zero.any():
        print("No off-diagonal elements found in the precision matrix.")
        return
    # plt.figure(1, facecolor='w', figsize=(10, 8))
    # plt.clf()
    # ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')
    embedding = embedding.T
    embed0 = embedding[0]
    embed1 = embedding[1]
    # Node size reflects the diagonal of the precision matrix.
    # plt.scatter(embed0, embed1, s=100 * d ** 2,
    ax.scatter(embed0, embed1, s=100 * d ** 2,
               c=labels, cmap=plt.cm.nipy_spectral)
    # tickers = returns.T.index
    tickers = X.T.index
    names = list(tickers)
    # Plot the edges between partially correlated stocks.
    start_idx, end_idx = np.where(non_zero)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(icov[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)
    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label, (x, y)) in enumerate(
            zip(names, labels, embedding.T)):
        # Axis-wise distances from this node to all others; the node's
        # own distance is set to 1 so it is never selected as nearest.
        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        # Nudge the label away from the nearest neighbor in each axis.
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x, y, name, size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
                           alpha=.6))
    # Pad the axis limits relative to the embedding's extent.
    plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
             embedding[0].max() + .10 * embedding[0].ptp(),)
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())
    # plt.show()
# For the PCA/DBSCAN case, we have a 50+ dimensional
# representation of the stocks, and we will embed those
# 50+ dimensional vectors in 2D space using TSNE, then
# we will plot the points and add lines connecting
# points within each pair.
def visualize_cointegrated_pairs(X, returns, embedding, pairs, cluster_map, fig):
    """Scatter the stocks that appear in cointegrated pairs in 2-D
    embedded space (colored by cluster) and draw a line connecting the
    two members of each pair.

    Parameters (as used below):
      X           -- feature matrix, one row per stock, aligned with returns.T
      returns     -- returns DataFrame; only returns.T.index (tickers) is used
      embedding   -- 2-D embedding array, row-aligned with X
      pairs       -- list of (ticker1, ticker2) tuples
      cluster_map -- Series mapping ticker -> cluster number
      fig         -- matplotlib figure to draw on
    """
    if len(pairs) == 0:
        print("No pairs to visualize.")
        return
    ax = fig.add_subplot(1,1,1)
    plt.axis('off')
    # map of tickers found in pairs to their clusters
    stocks = list(np.unique(pairs))
    in_pairs_series = cluster_map.loc[stocks]
    print("Size of map: %s" % str(in_pairs_series.shape))
    # X_df has tickers for its index, so X_df.loc[stocks] yields the
    # feature vectors (e.g. 50 principal components + 4 fundamentals)
    # for all stocks found in some pair or other.
    X_df = pd.DataFrame(index=returns.T.index, data=X)
    # X_pairs is not a list of pairs, but the subset of X_df rows for
    # the stock tickers found within some pair or other.
    X_pairs = X_df.loc[stocks]
    print("X_pairs.shape=%s" % str(X_pairs.shape))
    # Collect the embedded 2-D coordinates of every stock present in
    # one or more of the cointegrated pairs.
    emb = pd.DataFrame(columns=['x', 'y'])
    for pair in pairs:
        # Row numbers of the embedding array correspond to index
        # numbers of tickers in X_pairs; each row holds that ticker's
        # (x, y) coordinates in embedded space.
        # ticker1 = pair[0].symbol << Quantopian ticker object
        ticker1 = pair[0]
        loc1 = X_pairs.index.get_loc(pair[0])
        x1, y1 = embedding[loc1, :]
        emb.loc[loc1] = [x1, y1]
        # ticker2 = pair[0].symbol << Quantopian ticker object
        ticker2 = pair[1]
        loc2 = X_pairs.index.get_loc(pair[1])
        x2, y2 = embedding[loc2, :]
        emb.loc[loc2] = [x2, y2]
        # connect the two members of the pair; the dots themselves are
        # drawn after this loop completes
        plt.plot([x1, x2], [y1, y2], 'k-', alpha=0.3, c='gray');
    # normalize cluster numbers into [0, 1] for the colormap
    cmvals = in_pairs_series.values / max(in_pairs_series.values)
    plt.scatter(emb['x'], emb['y'], s=220,
                # alpha=0.9, c=[in_pairs_series.values],
                alpha=0.9, c=cmvals,
                cmap=cm.Paired)
    plt.title('T-SNE Visualization of Validated Pairs');
def do_data_scaling(X_Clean, year_range, num_obs):
    """Standardize the features of X_Clean to zero mean / unit variance.

    `year_range` and `num_obs` are accepted for interface compatibility
    but are not used.
    """
    # CLEANUP: fit_transform already returns a fresh array, so the
    # original's extra .copy() was redundant.
    myScaler = preprocessing.StandardScaler()
    return myScaler.fit_transform(X_Clean)
def get_market_returns_data(returns_file):
    """Load a pickled market-wide returns DataFrame from the
    market-returns folder and report its shape."""
    path = returnsfolder_market + returns_file
    with open(path, 'rb') as fd:
        returns = pkl.load(fd)
    print("returns.shape=%s" % str(returns.shape))
    return returns
def get_market_pricing_data(closing_prices_file):
    """Load a pickled market-wide closing-price DataFrame from the
    market-close folder and report its shape."""
    path = closefolder_market + closing_prices_file
    with open(path, 'rb') as fd:
        pricing = pkl.load(fd)
    print("pricing.shape=%s" % str(pricing.shape))
    return pricing
def get_sector_returns_data(returns_file):
    """Load a pickled per-sector returns DataFrame from the
    sector-returns folder and report its shape."""
    path = returnsfolder_sectors + returns_file
    with open(path, 'rb') as fd:
        returns = pkl.load(fd)
    print("returns.shape=%s" % str(returns.shape))
    return returns
def get_sector_pricing_data(closing_prices_file):
    """Load a pickled per-sector closing-price DataFrame from the
    sector-close folder and report its shape."""
    path = closefolder_sectors + closing_prices_file
    with open(path, 'rb') as fd:
        pricing = pkl.load(fd)
    print("pricing.shape=%s" % str(pricing.shape))
    return pricing
# --- Entire-market PCA + DBSCAN pipeline: load returns and pricing ---
universe = "EntireMarket"
returns_file = "returns_2000-2008_len_2485.pkl"
closing_prices_file = "pricing_2000-2008_len_2485.pkl"
returns = get_market_returns_data(returns_file)
pricing = get_market_pricing_data(closing_prices_file)
For PCA, prune returns and pricing so that they hold only those stocks for which we have fundamentals data.
# Keep unpruned copies before restricting to the fundamentals universe.
returnsUnpruned = returns.copy()
pricingUnpruned = pricing.copy()
# Transpose so tickers become the row index, which makes the
# index-intersection pruning below straightforward.
retT = returns.T
pricingT = pricing.T
# One spreadsheet per fundamentals metric, each indexed by ticker.
marketcap_filepath = fundamentalsfolder + "marketcap.xlsx"
pricetobook_filepath = fundamentalsfolder + "price_to_book.xlsx"
pricetoearnings_filepath = fundamentalsfolder + "price_to_earnings.xlsx"
enterprisevalue_filepath = fundamentalsfolder + "enterprise_value.xlsx"
marketcap = pd.read_excel(marketcap_filepath)
marketcap = marketcap.set_index('Ticker')
pricetobook = pd.read_excel(pricetobook_filepath)
pricetobook = pricetobook.set_index('Ticker')
pricetoearnings = pd.read_excel(pricetoearnings_filepath)
pricetoearnings = pricetoearnings.set_index('Ticker')
enterprisevalue = pd.read_excel(enterprisevalue_filepath)
enterprisevalue = enterprisevalue.set_index('Ticker')
# Build the set of tickers present in ALL six datasets (returns,
# pricing, and the four fundamentals tables).
retTIndex = retT.index
pricingTIndex = pricingT.index
marketcapIndex = marketcap.index
pricetobookIndex = pricetobook.index
pricetoearningsIndex = pricetoearnings.index
enterprisevalueIndex = enterprisevalue.index
idx = retTIndex.copy()
idx = idx.intersection(pricingTIndex)
idx = idx.intersection(marketcapIndex)
idx = idx.intersection(pricetobookIndex)
idx = idx.intersection(pricetoearningsIndex)
idx = idx.intersection(enterprisevalueIndex)
print("idx.shape=%s" % str(idx.shape))
# Restrict every dataset to the common ticker set (the extra
# .index.intersection(idx) keeps .loc from raising on missing labels).
marketcap = marketcap.loc[
    marketcap.index.intersection(idx)]
pricetobook = pricetobook.loc[
    pricetobook.index.intersection(idx)]
pricetoearnings = pricetoearnings.loc[
    pricetoearnings.index.intersection(idx)]
enterprisevalue = enterprisevalue.loc[
    enterprisevalue.index.intersection(idx)]
retT = retT.loc[
    retT.index.intersection(idx)]
pricingT = pricingT.loc[
    pricingT.index.intersection(idx)]
# Transpose back so tickers are columns again.
returns = retT.T
pricing = pricingT.T
print("returns.shape=%s" % str(returns.shape))
print("pricing.shape=%s" % str(pricing.shape))
# Reduce the returns to 50 principal components per ticker.
N_PRIN_COMPONENTS = 50
pca = PCA(n_components=N_PRIN_COMPONENTS)
pca.fit(returns)
print ("marketcap:\t\t%s" % str(marketcap.shape))
print ("pricetobook:\t\t%s" % str(pricetobook.shape))
print ("pricetoearnings:\t%s" % str(pricetoearnings.shape))
print ("enterprisevalue:\t%s" % str(enterprisevalue.shape))
# Feature matrix: one row per ticker -- 50 principal components plus
# the four fundamentals columns.
X = np.hstack(
    (pca.components_.T,
     marketcap['Marketcap'].values[:, np.newaxis],
     pricetobook['Pricetobook'].values[:, np.newaxis],
     pricetoearnings['Price to Earnings'].values[:, np.newaxis],
     enterprisevalue['Enterprisevalue'].values[:, np.newaxis]
     )
)
print("X.shape=%s" % str(X.shape))
# Standardize all features before clustering.
X = preprocessing.StandardScaler().fit_transform(X)
print("X.shape=%s" % str(X.shape))
clustering_results = build_dbscan_clusters(X, returns.columns, pricing)
# dbscan_cluster_map_all includes datapoints
# belonging to cluster -1 (i.e. datapoints
# belonging to no cluster)
dbscan_cluster_map_all = clustering_results[0]
dbscan_cluster_map = clustering_results[1]
dbscan_counts = clustering_results[2]
dbscan_labels = clustering_results[3]
# 2-D T-SNE embedding of the feature matrix for visualization.
embed0, embed1, embedding = get_TSNE_embeddings(X)
Points in clusters are colored and opaque. Points not belonging to a cluster (i.e. noise) are gray and translucent.
X_tsne = embedding
# Boolean mask over embedding rows: points assigned to a real cluster
# (dbscan_labels is a plain array, so the mask is an array of booleans).
x = X_tsne[(dbscan_labels!=-1), 0]
y = X_tsne[(dbscan_labels!=-1), 1]
# Here the comparison yields a pandas **Series** of booleans, hence the
# .values to index the ndarray.
x_noise = X_tsne[(dbscan_cluster_map_all==-1).values, 0]
y_noise = X_tsne[(dbscan_cluster_map_all==-1).values, 1]
fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_noisy_scatter_plot(x, y, x_noise, y_noise, dbscan_labels, fig)
Apply formula $\frac{N(N - 1)}{2}$ elementwise to each cluster, then sum the results to get the combinatorial sum of possible pairs.
# Apply N*(N-1)/2 elementwise to each cluster size, then sum.
# BUG FIX: this cell previously referenced `counts`, which is not
# defined at this point in the notebook (the DBSCAN section stores its
# cluster sizes in `dbscan_counts`); it only "worked" when a stale
# `counts` from a later affinity-propagation cell was still in memory.
print ("Total pairs possible in universe: %d " %
       (dbscan_counts*(dbscan_counts-1)/2).sum())
Detect cointegrated pairs in all clusters, then aggregate the results into a single list of pairs.
# Find cointegrated pairs within each DBSCAN cluster, then aggregate
# them into a single list (also pickled to Output\).
pair_data = aggregate_pairs_allclusters (
    universe,
    pricing,
    dbscan_cluster_map,
    dbscan_counts)
pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
Lastly, for the pairs we found and validated, let's visualize them in 2d space with T-SNE again.
fig = plt.figure(1, facecolor='w', figsize=(10, 8))
# Draw the validated pairs over the T-SNE embedding.
visualize_cointegrated_pairs(
    X,
    returns,
    X_tsne,
    pairs_allclusters,
    dbscan_cluster_map,
    fig)
# --- Entire-market affinity-propagation pipeline (reloads the full,
# unpruned market data) ---
universe = "EntireMarket"
returns_file = "returns_2000-2008_len_2485.pkl"
pricing_file = "pricing_2000-2008_len_2485.pkl"
returns = get_market_returns_data(returns_file)
pricing = get_market_pricing_data(pricing_file)
Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " %
       (ticker_count*(ticker_count-1)/2))
# Cluster the whole market and unpack the results.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov = cluster_info[0]
icov = cluster_info[1]
labels = cluster_info[2]
counts = cluster_info[3]
cluster_map = cluster_info[4]
# 2-D LLE embedding of the returns for visualization.
embed0, embed1, embedding = get_LLE_embeddings(returns)
fig = plt.figure(1, facecolor='w', figsize=(7, 5))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=10)
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)
Detect cointegrated pairs in all clusters, then aggregate the results into a single list of pairs.
pair_data = aggregate_pairs_allclusters (
    universe,
    pricing,
    cluster_map,
    counts)
pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(
    returns.T,
    returns,
    embedding,
    pairs_allclusters,
    cluster_map,
    fig)
# Cross-check: partial-correlation pair search at successively looser
# thresholds, each followed by a cointegration test.
df = partially_correlated_pairs(universe, returns, icov, 0.4)
pairs = find_coint_partially_corr_pairs(df)
df = partially_correlated_pairs(universe, returns, icov, 0.1)
pairs = find_coint_partially_corr_pairs(df)
df = partially_correlated_pairs(universe, returns, icov, 0.01)
pairs = find_coint_partially_corr_pairs(df)
# --- Sector run: Basic Materials (same affinity-propagation pipeline
# as the entire-market section above) ---
universe = "Basic Materials"
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " %
       (ticker_count*(ticker_count-1)/2))
# Cluster the sector's stocks and unpack the results.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov = cluster_info[0]
icov = cluster_info[1]
labels = cluster_info[2]
counts = cluster_info[3]
cluster_map = cluster_info[4]
# 2-D LLE embedding for visualization.
embed0, embed1, embedding = get_LLE_embeddings(returns)
fig = plt.figure(1, facecolor='w', figsize=(7, 5))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)
# Cointegration search within each cluster, then visualize the pairs.
pair_data = aggregate_pairs_allclusters (
    universe,
    pricing,
    cluster_map,
    counts)
pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(
    returns.T,
    returns,
    embedding,
    pairs_allclusters,
    cluster_map,
    fig)
# Cross-check with partial-correlation pairs at threshold 0.4.
df = partially_correlated_pairs(universe, returns, icov, 0.4)
pairs = find_coint_partially_corr_pairs(df)
# --- Sector run: Cyclical Consumer ---
universe = "Cyclical Consumer"
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " %
       (ticker_count*(ticker_count-1)/2))
# Cluster the sector's stocks and unpack the results.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov = cluster_info[0]
icov = cluster_info[1]
labels = cluster_info[2]
counts = cluster_info[3]
cluster_map = cluster_info[4]
# 2-D LLE embedding for visualization.
embed0, embed1, embedding = get_LLE_embeddings(returns)
fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)
# Cointegration search within each cluster, then visualize the pairs.
pair_data = aggregate_pairs_allclusters (
    universe,
    pricing,
    cluster_map,
    counts)
pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(
    returns.T,
    returns,
    embedding,
    pairs_allclusters,
    cluster_map,
    fig)
# Cross-check with partial-correlation pairs at threshold 0.4.
df = partially_correlated_pairs(universe, returns, icov, 0.4)
pairs = find_coint_partially_corr_pairs(df)
# Switch the analysis to the Energy sector universe.
universe = "Energy"
sector_file = universe + ".pkl"
returns = get_sector_returns_data(sector_file)
pricing = get_sector_pricing_data(sector_file)
The formula $\frac{N(N - 1)}{2}$ gives the number of possible pair combinations in this stock universe.
# Size of the search space: N(N - 1) / 2 candidate pairs.
ticker_count = len(returns.columns)
print("Total pairs possible in universe: %d " %
      (ticker_count * (ticker_count - 1) // 2))

# Affinity-cluster the universe, then unpack the pieces we need.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov, icov, labels, counts, cluster_map = cluster_info[:5]

# Two-dimensional locally-linear embedding of the return series.
embed0, embed1, embedding = get_LLE_embeddings(returns)

fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)

fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)

# Collect candidate pairs cluster by cluster.
pair_data = aggregate_pairs_allclusters(universe, pricing,
                                        cluster_map, counts)
pairs_allclusters, clust_dict = pair_data[:2]

fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(returns.T, returns, embedding,
                             pairs_allclusters, cluster_map, fig)

# Repeat the cointegration screen at progressively looser
# partial-correlation thresholds.
for threshold in (0.4, 0.2):
    df = partially_correlated_pairs(universe, returns, icov, threshold)
    pairs = find_coint_partially_corr_pairs(df)
# Switch the analysis to the Financials sector universe.
universe = "Financials"
sector_file = universe + ".pkl"
returns = get_sector_returns_data(sector_file)
pricing = get_sector_pricing_data(sector_file)
The formula $\frac{N(N - 1)}{2}$ gives the number of possible pair combinations in this stock universe.
# Size of the search space: N(N - 1) / 2 candidate pairs.
ticker_count = len(returns.columns)
print("Total pairs possible in universe: %d " %
      (ticker_count * (ticker_count - 1) // 2))

# Affinity-cluster the universe, then unpack the pieces we need.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov, icov, labels, counts, cluster_map = cluster_info[:5]

# Two-dimensional locally-linear embedding of the return series.
embed0, embed1, embedding = get_LLE_embeddings(returns)

# Smaller dots here: this universe has many more tickers.
fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=30)

fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)

# Collect candidate pairs cluster by cluster.
pair_data = aggregate_pairs_allclusters(universe, pricing,
                                        cluster_map, counts)
pairs_allclusters, clust_dict = pair_data[:2]

fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(returns.T, returns, embedding,
                             pairs_allclusters, cluster_map, fig)

# Repeat the cointegration screen at progressively looser
# partial-correlation thresholds.
for threshold in (0.4, 0.2, 0.1):
    df = partially_correlated_pairs(universe, returns, icov, threshold)
    pairs = find_coint_partially_corr_pairs(df)
# Switch the analysis to the Healthcare sector universe.
universe = "Healthcare"
sector_file = universe + ".pkl"
returns = get_sector_returns_data(sector_file)
pricing = get_sector_pricing_data(sector_file)
The formula $\frac{N(N - 1)}{2}$ gives the number of possible pair combinations in this stock universe.
# Size of the search space: N(N - 1) / 2 candidate pairs.
ticker_count = len(returns.columns)
print("Total pairs possible in universe: %d " %
      (ticker_count * (ticker_count - 1) // 2))

# Affinity-cluster the universe, then unpack the pieces we need.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov, icov, labels, counts, cluster_map = cluster_info[:5]

# Two-dimensional locally-linear embedding of the return series.
embed0, embed1, embedding = get_LLE_embeddings(returns)

fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)

fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)

# Collect candidate pairs cluster by cluster.
pair_data = aggregate_pairs_allclusters(universe, pricing,
                                        cluster_map, counts)
pairs_allclusters, clust_dict = pair_data[:2]

fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(returns.T, returns, embedding,
                             pairs_allclusters, cluster_map, fig)

# Repeat the cointegration screen at progressively looser
# partial-correlation thresholds.
for threshold in (0.4, 0.2, 0.1):
    df = partially_correlated_pairs(universe, returns, icov, threshold)
    pairs = find_coint_partially_corr_pairs(df)
# Notebook-style display of the final result (no-op when run as a script).
pairs
# Switch the analysis to the Industrials sector universe.
universe = "Industrials"
sector_file = universe + ".pkl"
returns = get_sector_returns_data(sector_file)
pricing = get_sector_pricing_data(sector_file)
The formula $\frac{N(N - 1)}{2}$ gives the number of possible pair combinations in this stock universe.
# Size of the search space: N(N - 1) / 2 candidate pairs.
ticker_count = len(returns.columns)
print("Total pairs possible in universe: %d " %
      (ticker_count * (ticker_count - 1) // 2))

# Affinity-cluster the universe, then unpack the pieces we need.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov, icov, labels, counts, cluster_map = cluster_info[:5]

# Two-dimensional locally-linear embedding of the return series.
embed0, embed1, embedding = get_LLE_embeddings(returns)

fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)

fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)

# Collect candidate pairs cluster by cluster.
pair_data = aggregate_pairs_allclusters(universe, pricing,
                                        cluster_map, counts)
pairs_allclusters, clust_dict = pair_data[:2]

fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(returns.T, returns, embedding,
                             pairs_allclusters, cluster_map, fig)

# Repeat the cointegration screen at progressively looser
# partial-correlation thresholds.
for threshold in (0.4, 0.2):
    df = partially_correlated_pairs(universe, returns, icov, threshold)
    pairs = find_coint_partially_corr_pairs(df)
# Switch the analysis to the Non-cyclicals sector universe.
universe = "Non-cyclicals"
sector_file = universe + ".pkl"
returns = get_sector_returns_data(sector_file)
pricing = get_sector_pricing_data(sector_file)
The formula $\frac{N(N - 1)}{2}$ gives the number of possible pair combinations in this stock universe.
# Size of the search space: N(N - 1) / 2 candidate pairs.
ticker_count = len(returns.columns)
print("Total pairs possible in universe: %d " %
      (ticker_count * (ticker_count - 1) // 2))

# Affinity-cluster the universe, then unpack the pieces we need.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov, icov, labels, counts, cluster_map = cluster_info[:5]

# Two-dimensional locally-linear embedding of the return series.
embed0, embed1, embedding = get_LLE_embeddings(returns)

fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)

fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)

# Collect candidate pairs cluster by cluster.
pair_data = aggregate_pairs_allclusters(universe, pricing,
                                        cluster_map, counts)
pairs_allclusters, clust_dict = pair_data[:2]

fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(returns.T, returns, embedding,
                             pairs_allclusters, cluster_map, fig)

# Repeat the cointegration screen at progressively looser
# partial-correlation thresholds.
for threshold in (0.4, 0.2, 0.05):
    df = partially_correlated_pairs(universe, returns, icov, threshold)
    pairs = find_coint_partially_corr_pairs(df)
# Switch the analysis to the Technology sector universe.
universe = "Technology"
sector_file = universe + ".pkl"
returns = get_sector_returns_data(sector_file)
pricing = get_sector_pricing_data(sector_file)
The formula $\frac{N(N - 1)}{2}$ gives the number of possible pair combinations in this stock universe.
# Size of the search space: N(N - 1) / 2 candidate pairs.
ticker_count = len(returns.columns)
print("Total pairs possible in universe: %d " %
      (ticker_count * (ticker_count - 1) // 2))

# Affinity-cluster the universe, then unpack the pieces we need.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov, icov, labels, counts, cluster_map = cluster_info[:5]

# Two-dimensional locally-linear embedding of the return series.
embed0, embed1, embedding = get_LLE_embeddings(returns)

fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)

fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)

# Collect candidate pairs cluster by cluster.
pair_data = aggregate_pairs_allclusters(universe, pricing,
                                        cluster_map, counts)
pairs_allclusters, clust_dict = pair_data[:2]

fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(returns.T, returns, embedding,
                             pairs_allclusters, cluster_map, fig)

# Repeat the cointegration screen at progressively looser
# partial-correlation thresholds.
for threshold in (0.4, 0.2, 0.1):
    df = partially_correlated_pairs(universe, returns, icov, threshold)
    pairs = find_coint_partially_corr_pairs(df)
# Switch the analysis to the Telecom sector universe.
universe = "Telecom"
sector_file = universe + ".pkl"
returns = get_sector_returns_data(sector_file)
pricing = get_sector_pricing_data(sector_file)
The formula $\frac{N(N - 1)}{2}$ gives the number of possible pair combinations in this stock universe.
# Size of the search space: N(N - 1) / 2 candidate pairs.
ticker_count = len(returns.columns)
print("Total pairs possible in universe: %d " %
      (ticker_count * (ticker_count - 1) // 2))

# Affinity-cluster the universe, then unpack the pieces we need.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov, icov, labels, counts, cluster_map = cluster_info[:5]

# Two-dimensional locally-linear embedding of the return series.
embed0, embed1, embedding = get_LLE_embeddings(returns)

fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)

fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)

# Collect candidate pairs cluster by cluster.
pair_data = aggregate_pairs_allclusters(universe, pricing,
                                        cluster_map, counts)
pairs_allclusters, clust_dict = pair_data[:2]

fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(returns.T, returns, embedding,
                             pairs_allclusters, cluster_map, fig)

# Screen partially-correlated pairs for cointegration.
df = partially_correlated_pairs(universe, returns, icov, 0.4)
pairs = find_coint_partially_corr_pairs(df)
# Switch the analysis to the Utilities sector universe.
universe = "Utilities"
sector_file = universe + ".pkl"
returns = get_sector_returns_data(sector_file)
pricing = get_sector_pricing_data(sector_file)
The formula $\frac{N(N - 1)}{2}$ gives the number of possible pair combinations in this stock universe.
# Size of the search space: N(N - 1) / 2 candidate pairs.
ticker_count = len(returns.columns)
print("Total pairs possible in universe: %d " %
      (ticker_count * (ticker_count - 1) // 2))

# Affinity-cluster the universe, then unpack the pieces we need.
cluster_info = build_affinity_clusters(universe, returns, pricing)
cov, icov, labels, counts, cluster_map = cluster_info[:5]

# Two-dimensional locally-linear embedding of the return series.
embed0, embed1, embedding = get_LLE_embeddings(returns)

fig = plt.figure(1, facecolor='w', figsize=(10, 8))
do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)

fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig)

# Collect candidate pairs cluster by cluster.
pair_data = aggregate_pairs_allclusters(universe, pricing,
                                        cluster_map, counts)
pairs_allclusters, clust_dict = pair_data[:2]

fig = plt.figure(1, facecolor='w', figsize=(8, 6))
visualize_cointegrated_pairs(returns.T, returns, embedding,
                             pairs_allclusters, cluster_map, fig)

# Screen partially-correlated pairs for cointegration.
df = partially_correlated_pairs(universe, returns, icov, 0.4)
pairs = find_coint_partially_corr_pairs(df)